{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Summary Evaluations\n",
    "\n",
    "Download Amazon product reviews and parse the raw data into a pandas dataframe."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "from typing import List\n",
    "from urllib.request import urlopen\n",
    "\n",
    "import pandas as pd\n",
    "import tiktoken\n",
    "from langchain.chains.summarize import load_summarize_chain\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.schema import Document\n",
    "from phoenix.experimental.evals import PromptTemplate\n",
    "from phoenix.experimental.evals.evaluators import MapReducer, Refiner\n",
    "from phoenix.experimental.evals.models import OpenAIModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"https://snap.stanford.edu/data/amazon/Cell_Phones_&_Accessories.txt.gz\"\n",
    "data = []  # one dict per parsed review\n",
    "review_data = {}  # fields of the review currently being read\n",
    "with urlopen(url) as response, gzip.open(response, \"rt\", encoding=\"utf-8\") as unzipped:\n",
    "    for line in unzipped:\n",
    "        line = line.strip()\n",
    "        if not line:\n",
    "            # A blank line terminates the current record.\n",
    "            if review_data:\n",
    "                data.append(review_data)\n",
    "                review_data = {}\n",
    "            continue\n",
    "        # Split on the first \": \" only, so values may themselves contain \": \".\n",
    "        key, delimiter, value = line.partition(\": \")\n",
    "        review_data[key] = value if delimiter else None\n",
    "    # Flush the last record when the stream does not end with a blank line.\n",
    "    if review_data:\n",
    "        data.append(review_data)\n",
    "\n",
    "df = pd.DataFrame(data)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count how many rows each product ID has, to help pick a product to summarize.\n",
    "df[\"product/productId\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows for one product, then tally its review summaries.\n",
    "target_product_id = \"B0009B0IX4\"\n",
    "is_target_product = df[\"product/productId\"] == target_product_id\n",
    "product_df = df.loc[is_target_product]\n",
    "product_df[\"review/summary\"].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Gather documents into chunks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenizer used by gather_documents_into_chunks to measure lengths in tokens.\n",
    "encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
    "\n",
    "\n",
    "def gather_documents_into_chunks(\n",
    "    documents: List[str],\n",
    "    max_tokens_per_chunk: int,\n",
    "    separator=\"\\n\\n======\\n\\n\",\n",
    ") -> List[str]:\n",
    "    \"\"\"Greedily pack documents, in order, into separator-joined chunks.\n",
    "\n",
    "    A document joins the current chunk while the running token count (the\n",
    "    document plus one separator per join) stays within `max_tokens_per_chunk`;\n",
    "    otherwise the current chunk is closed and the document starts a new one.\n",
    "    Token counts come from the module-level `encoding` and are summed per\n",
    "    piece rather than measured on the joined text.\n",
    "\n",
    "    A single document longer than `max_tokens_per_chunk` is neither split nor\n",
    "    truncated: it becomes a chunk of its own that exceeds the limit.\n",
    "\n",
    "    Args:\n",
    "        documents: Texts to pack, in the order they should appear.\n",
    "        max_tokens_per_chunk: Token budget for each chunk.\n",
    "        separator: Text placed between documents that share a chunk.\n",
    "\n",
    "    Returns:\n",
    "        The chunks, each one the documents it holds joined by `separator`.\n",
    "    \"\"\"\n",
    "    separator_cost = len(encoding.encode(separator))\n",
    "    chunks = []\n",
    "    pending = []  # documents collected for the chunk being built\n",
    "    pending_tokens = 0\n",
    "    for document in documents:\n",
    "        document_cost = len(encoding.encode(document))\n",
    "        # The first document of a chunk needs no separator in front of it.\n",
    "        added_cost = document_cost + separator_cost if pending else document_cost\n",
    "        if pending_tokens + added_cost > max_tokens_per_chunk:\n",
    "            # Over budget: close the chunk in progress and start a new one here.\n",
    "            if pending:\n",
    "                chunks.append(separator.join(pending))\n",
    "            pending = [document]\n",
    "            pending_tokens = document_cost\n",
    "            continue\n",
    "        pending.append(document)\n",
    "        pending_tokens += added_cost\n",
    "    if pending:\n",
    "        chunks.append(separator.join(pending))\n",
    "    return chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the reviews reproducibly. Missing texts are dropped first: the parser stores\n",
    "# None for a field whose value is empty, and such entries cannot be token-counted.\n",
    "documents = product_df[\"review/text\"].dropna().sample(frac=1, random_state=0).to_list()\n",
    "gpt4_context_window_in_tokens = 8192\n",
    "chunks = gather_documents_into_chunks(\n",
    "    documents=documents,\n",
    "    max_tokens_per_chunk=(gpt4_context_window_in_tokens - 1000),  # add in a buffer\n",
    ")[:3]  # keep only the first three chunks\n",
    "chunks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Summarize with a LangChain \"refine\" chain."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap each chunk in a LangChain Document under its own name, so that `documents`\n",
    "# (the list of raw review strings) is not rebound to a different type.\n",
    "llm = ChatOpenAI(model=\"gpt-4\")\n",
    "chain = load_summarize_chain(llm, chain_type=\"refine\")\n",
    "chunk_documents = [Document(page_content=chunk) for chunk in chunks]\n",
    "summary = chain.run(chunk_documents)\n",
    "print(summary)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Evaluate the summary using `MapReducer`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = OpenAIModel(\n",
    "    model_name=\"gpt-4\",\n",
    ")\n",
    "# `summary` is baked into the template text by the f-string pieces below, while\n",
    "# `{chunk}` and `{mapped}` stay in the text as template placeholders.\n",
    "# NOTE(review): a summary containing literal braces would land in the template text\n",
    "# unescaped - confirm PromptTemplate tolerates that.\n",
    "map_prompt_template = PromptTemplate(\n",
    "    \"You will be given a CONTEXT that contains multiple documents. \"\n",
    "    \"You will also be given a SUMMARY that summarizes the documents in the CONTEXT in addition to other (unseen) documents. \"\n",
    "    \"You must provide an EVALUATION of the quality of the SUMMARY relative to the provided CONTEXT. \"\n",
    "    \"Your EVALUATION should judge the quality of the SUMMARY and should concisely explain your reasoning. \"\n",
    "    \"Bear in mind that the SUMMARY may include information from unseen documents. \"\n",
    "    \"Focus on important points, not trivial details.\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    f\"SUMMARY: {summary}\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    \"CONTEXT: {chunk}\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    \"EVALUATION: \"\n",
    ")\n",
    "reduce_prompt_template = PromptTemplate(\n",
    "    \"You will be given a SUMMARY that summarizes a large number of documents. \"\n",
    "    \"You will also be given a list of EVALUATIONS of the quality of that SUMMARY. \"\n",
    "    \"Each evaluation judges the SUMMARY relative to a different subset of the documents it summarizes. \"\n",
    "    \"Given this list, you must provide a single, OVERALL EVALUATION of the quality of the SUMMARY that should take into account the individual EVALUATIONS. \"\n",
    "    'Your OVERALL EVALUATION should judge the quality of the SUMMARY as either \"good\" or \"bad\" and should only contain one of those two words with no additional explanation.'\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    f\"SUMMARY: {summary}\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    \"EVALUATIONS: {mapped}\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    \"OVERALL EVALUATION: \"\n",
    ")\n",
    "evaluator = MapReducer(\n",
    "    model=model,\n",
    "    map_prompt_template=map_prompt_template,\n",
    "    reduce_prompt_template=reduce_prompt_template,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uses whichever `evaluator` was bound most recently; when the cells run top to bottom,\n",
    "# that is the MapReducer built in the cell above.\n",
    "summary_evaluation = evaluator.evaluate(chunks)\n",
    "print(summary_evaluation)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Evaluate the summary using `Refiner`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = OpenAIModel(model_name=\"gpt-4\")\n",
    "# `summary` is baked into each template by the f-string pieces below, while `{chunk}`\n",
    "# and `{accumulator}` stay in the text as template placeholders. Every section is set\n",
    "# off by a \"=======\" divider with a blank line on both sides.\n",
    "# NOTE(review): a summary containing literal braces would land in the template text\n",
    "# unescaped - confirm PromptTemplate tolerates that.\n",
    "initial_prompt_template = PromptTemplate(\n",
    "    \"You will be given a CONTEXT that contains multiple documents. \"\n",
    "    \"You will also be given a SUMMARY that summarizes the documents in the CONTEXT in addition to other (unseen) documents. \"\n",
    "    \"You must provide an EVALUATION of the quality of the SUMMARY relative to the provided CONTEXT. \"\n",
    "    \"Your EVALUATION should judge the quality of the SUMMARY and should concisely explain your reasoning. \"\n",
    "    \"Bear in mind that the SUMMARY may include information from unseen documents. \"\n",
    "    \"Focus on important points, not trivial details.\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    f\"SUMMARY: {summary}\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    \"CONTEXT: {chunk}\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    \"EVALUATION: \"\n",
    ")\n",
    "refine_prompt_template = PromptTemplate(\n",
    "    \"You will be given: \\n\"\n",
    "    \"  - a CONTEXT that contains multiple documents\\n\"\n",
    "    \"  - a SUMMARY that summarizes the documents in the CONTEXT in addition to other (unseen) documents\\n\"\n",
    "    \"  - an ACCUMULATED EVALUATION of the quality of the SUMMARY relative to other subsets of the summarized documents\\n\"\n",
    "    \"You must provide a REFINED EVALUATION of the quality of the SUMMARY that considers the current CONTEXT. \"\n",
    "    \"Bear in mind that the SUMMARY may include information from unseen documents, although you don't need to explicitly mention that. \"\n",
    "    \"Focus on important points, not trivial details.\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    f\"SUMMARY: {summary}\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    \"CONTEXT: {chunk}\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    \"ACCUMULATED EVALUATION: {accumulator}\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    \"REFINED EVALUATION: \"\n",
    ")\n",
    "synthesize_prompt_template = PromptTemplate(\n",
    "    \"You will be given a SUMMARY that summarizes a large number of documents. \"\n",
    "    \"You will also be given a VERBOSE EVALUATION of the quality of that SUMMARY. \"\n",
    "    \"Given this VERBOSE EVALUATION, you must provide a single, CONCISE EVALUATION of the quality of the SUMMARY. \"\n",
    "    'Your CONCISE EVALUATION should judge the quality of the SUMMARY as either \"good\" or \"bad\" and should only contain one of those two words with no additional explanation.'\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    f\"SUMMARY: {summary}\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    \"VERBOSE EVALUATION: {accumulator}\"\n",
    "    \"\\n\\n\"\n",
    "    \"=======\"\n",
    "    \"\\n\\n\"\n",
    "    \"CONCISE EVALUATION: \"\n",
    ")\n",
    "evaluator = Refiner(\n",
    "    model=model,\n",
    "    initial_prompt_template=initial_prompt_template,\n",
    "    refine_prompt_template=refine_prompt_template,\n",
    "    synthesize_prompt_template=synthesize_prompt_template,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uses whichever `evaluator` was bound most recently; when the cells run top to bottom,\n",
    "# that is the Refiner built in the cell above.\n",
    "summary_evaluation = evaluator.evaluate(chunks)\n",
    "print(summary_evaluation)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
